1 Introduction

1.1 Loading libraries

library('dplyr') # data manipulation
library('reshape2') # data manipulation
library('ggplot2') # visualization
library('knitr')
library('plotly') # visualization
library('corrplot') # package corrplot
library('caret') # data mining
library('zoo') # NA values approximation

1.2 Loading data

ds <- read.csv(url("http://www.cs.put.poznan.pl/dbrzezinski/teaching/zed/sledzie.csv"),na.strings = c("NA","NaN", " ","?") )
ds <- tbl_df(ds)

1.3 Explanation of attributes

X may be considered as main Id and indicates chronological order. The lower the value the older is the sample.

Variable Name Description Units
length length of herring [cm]
cfin1 accessibility of plankton [concentration of Calanus finmarchicus spec. 1];
cfin2 accessibility of plankton [concentration of Calanus finmarchicus spec. 2];
chel1 accessibility of plankton [concentration of Calanus helgolandicus spec. 1];
chel2 accessibility of plankton [concentration of Calanus helgolandicus spec. 2];
lcop1 accessibility of plankton [concentration of Copepoda spec. 1];
lcop2 accessibility of plankton [concentration of Copepoda spec. 2];
fbar intensity of fishing in region [fraction of preserved fry];
recr annual fry [number of herrings];
cumf total annual intensity of fishing in region [fraction of preserved fry];
totaln total number of fishes catched during a single fishing [number of herrings];
sst temperature of the water at the surface [°C];
sal level of salinity [Knudsen ppt];
xmonth month of fishing [number of month];
nao North Atlantic oscillation (difference of atmospheric pressure at sea level between the Icelandic low and the Azores high) [mb]

2 Exploration

2.1 Basic statistics

Empty values are not significant.

knitr::kable(summary(ds))
X length cfin1 cfin2 chel1 chel2 lcop1 lcop2 fbar recr cumf totaln sst sal xmonth nao
Min. : 0 Min. :19.0 Min. : 0.0000 Min. : 0.0000 Min. : 0.000 Min. : 5.238 Min. : 0.3074 Min. : 7.849 Min. :0.0680 Min. : 140515 Min. :0.06833 Min. : 144137 Min. :12.77 Min. :35.40 Min. : 1.000 Min. :-4.89000
1st Qu.:13145 1st Qu.:24.0 1st Qu.: 0.0000 1st Qu.: 0.2778 1st Qu.: 2.469 1st Qu.:13.427 1st Qu.: 2.5479 1st Qu.:17.808 1st Qu.:0.2270 1st Qu.: 360061 1st Qu.:0.14809 1st Qu.: 306068 1st Qu.:13.60 1st Qu.:35.51 1st Qu.: 5.000 1st Qu.:-1.89000
Median :26291 Median :25.5 Median : 0.1111 Median : 0.7012 Median : 5.750 Median :21.673 Median : 7.0000 Median :24.859 Median :0.3320 Median : 421391 Median :0.23191 Median : 539558 Median :13.86 Median :35.51 Median : 8.000 Median : 0.20000
Mean :26291 Mean :25.3 Mean : 0.4458 Mean : 2.0248 Mean :10.006 Mean :21.221 Mean : 12.8108 Mean :28.419 Mean :0.3304 Mean : 520367 Mean :0.22981 Mean : 514973 Mean :13.87 Mean :35.51 Mean : 7.258 Mean :-0.09236
3rd Qu.:39436 3rd Qu.:26.5 3rd Qu.: 0.3333 3rd Qu.: 1.7936 3rd Qu.:11.500 3rd Qu.:27.193 3rd Qu.: 21.2315 3rd Qu.:37.232 3rd Qu.:0.4560 3rd Qu.: 724151 3rd Qu.:0.29803 3rd Qu.: 730351 3rd Qu.:14.16 3rd Qu.:35.52 3rd Qu.: 9.000 3rd Qu.: 1.63000
Max. :52581 Max. :32.5 Max. :37.6667 Max. :19.3958 Max. :75.000 Max. :57.706 Max. :115.5833 Max. :68.736 Max. :0.8490 Max. :1565890 Max. :0.39801 Max. :1015595 Max. :14.73 Max. :35.61 Max. :12.000 Max. : 5.08000
NA NA NA’s :1581 NA’s :1536 NA’s :1555 NA’s :1556 NA’s :1653 NA’s :1591 NA NA NA NA NA’s :1584 NA NA NA

2.2 Data distribution

d <- melt(ds[,-c(1)])
## No id variables; using all as measure variables
ggplot(d,aes(x = value)) + 
    facet_wrap(~variable,scales = "free_x") + 
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

2.3 What is the length and time correlation ?

2.3.1 Full history

p1 <- ggplot(ds,aes(X,length)) + geom_point(alpha=0.04) + geom_smooth(method="auto", se=TRUE, color="red")

plt1 <- htmltools::tagList()

plt1 <- as.widget(ggplotly(p1)) 
## `geom_smooth()` using method = 'gam'
plt1

2.3.2 History of length in months

p2 <- ggplot(ds,aes(X,length, group = xmonth)) + geom_point(stat = "identity",alpha=0.03 ) + geom_smooth(method="auto", se=TRUE,color="red") + facet_grid(~xmonth, scales = "fixed") + facet_wrap(~ xmonth, ncol = 3,labeller = month_labeller) + theme_minimal()

plt2 <- htmltools::tagList()

plt2 <- as.widget(ggplotly(p2)) 
## `geom_smooth()` using method = 'gam'
plt2

2.4 Preparing data

Data set is split into training and test data set.

set.seed(1993)

ds2 <- ds %>% select(-c(X))

inTraining <- 
    createDataPartition(
        y = ds2$length,
        p = .75,
        list = FALSE)

training <- ds2[ inTraining,]
testing  <- ds2[-inTraining,]

2.5 Handling NAs nulls and emptities

Just by looking through the data low standard deviation per month for every attribute can be noticed. Many values seems to be duplicated or changed insignificantly. Simple approximation is used to determine missing values. Every row where value of NA was not determined by this method is deleted. Testing set is reduced to complete cases only.

set.seed(1993)

for(i in 1:ncol(training)){
  if(anyNA(ds2[,i])){
    training[,i] <- na.approx(training[,i],na.rm = F,maxgap = 5)
  }
}


training <- training[complete.cases(training),]
testing <- testing[complete.cases(testing),]


knitr::kable(summary(training))
length cfin1 cfin2 chel1 chel2 lcop1 lcop2 fbar recr cumf totaln sst sal xmonth nao
Min. :19.00 Min. : 0.0000 Min. : 0.0000 Min. : 0.000 Min. : 5.238 Min. : 0.3074 Min. : 7.849 Min. :0.0680 Min. : 140515 Min. :0.06833 Min. : 144137 Min. :12.77 Min. :35.40 Min. : 1.000 Min. :-4.89000
1st Qu.:24.00 1st Qu.: 0.0000 1st Qu.: 0.2778 1st Qu.: 2.469 1st Qu.:13.427 1st Qu.: 2.5479 1st Qu.:17.808 1st Qu.:0.2270 1st Qu.: 360061 1st Qu.:0.14809 1st Qu.: 306068 1st Qu.:13.60 1st Qu.:35.51 1st Qu.: 5.000 1st Qu.:-1.89000
Median :25.50 Median : 0.1111 Median : 0.7012 Median : 5.750 Median :21.435 Median : 7.0000 Median :24.859 Median :0.3320 Median : 421391 Median :0.23191 Median : 539558 Median :13.86 Median :35.51 Median : 8.000 Median : 0.20000
Mean :25.31 Mean : 0.4419 Mean : 2.0100 Mean :10.015 Mean :21.198 Mean : 12.8117 Mean :28.386 Mean :0.3306 Mean : 519930 Mean :0.22965 Mean : 515111 Mean :13.87 Mean :35.51 Mean : 7.257 Mean :-0.09418
3rd Qu.:26.50 3rd Qu.: 0.3333 3rd Qu.: 1.7936 3rd Qu.:11.500 3rd Qu.:27.193 3rd Qu.: 21.2315 3rd Qu.:37.232 3rd Qu.:0.4650 3rd Qu.: 724151 3rd Qu.:0.29803 3rd Qu.: 730351 3rd Qu.:14.21 3rd Qu.:35.52 3rd Qu.: 9.000 3rd Qu.: 1.63000
Max. :32.50 Max. :37.6667 Max. :19.3958 Max. :75.000 Max. :57.706 Max. :115.5833 Max. :68.736 Max. :0.8490 Max. :1565890 Max. :0.39801 Max. :1015595 Max. :14.73 Max. :35.61 Max. :12.000 Max. : 5.08000
knitr::kable(summary(testing))
length cfin1 cfin2 chel1 chel2 lcop1 lcop2 fbar recr cumf totaln sst sal xmonth nao
Min. :19.50 Min. : 0.0000 Min. : 0.0000 Min. : 0.000 Min. : 5.238 Min. : 0.3074 Min. : 7.849 Min. :0.0680 Min. : 140515 Min. :0.06833 Min. : 144137 Min. :12.77 Min. :35.40 Min. : 1.000 Min. :-4.8900
1st Qu.:24.00 1st Qu.: 0.0000 1st Qu.: 0.2671 1st Qu.: 2.469 1st Qu.:13.427 1st Qu.: 2.5479 1st Qu.:17.808 1st Qu.:0.2270 1st Qu.: 360061 1st Qu.:0.14809 1st Qu.: 306068 1st Qu.:13.63 1st Qu.:35.51 1st Qu.: 6.000 1st Qu.:-1.8900
Median :25.50 Median : 0.1111 Median : 0.7012 Median : 5.750 Median :21.673 Median : 7.0717 Median :24.859 Median :0.3320 Median : 421391 Median :0.23191 Median : 539558 Median :13.86 Median :35.51 Median : 8.000 Median : 0.2000
Mean :25.31 Mean : 0.4526 Mean : 2.0474 Mean : 9.948 Mean :21.251 Mean : 12.7925 Mean :28.450 Mean :0.3297 Mean : 521438 Mean :0.23005 Mean : 514946 Mean :13.88 Mean :35.51 Mean : 7.252 Mean :-0.0918
3rd Qu.:26.50 3rd Qu.: 0.3333 3rd Qu.: 1.7936 3rd Qu.:11.500 3rd Qu.:27.193 3rd Qu.: 21.2315 3rd Qu.:37.232 3rd Qu.:0.4560 3rd Qu.: 724151 3rd Qu.:0.29803 3rd Qu.: 730351 3rd Qu.:14.16 3rd Qu.:35.52 3rd Qu.: 9.000 3rd Qu.: 1.6300
Max. :32.00 Max. :37.6667 Max. :19.3958 Max. :75.000 Max. :57.706 Max. :115.5833 Max. :68.736 Max. :0.8490 Max. :1565890 Max. :0.39801 Max. :1015595 Max. :14.73 Max. :35.61 Max. :12.000 Max. : 5.0800

2.6 Attribute correlation

Length attribute seems to be negatively correlated to time and North Atlantic oscillation but is positively correlated to intensity of fishing in region. Moreover length is negatively correlated to temperature of the water at the surface. Other correlations confirms validity of data. For example total number of fishes catched during a single fishing is correlated to total annual intensity of fishing in region.

knitr::kable(cor(training))
X length cfin1 cfin2 chel1 chel2 lcop1 lcop2 fbar recr cumf totaln sst sal xmonth nao
X 1.0000000 -0.3384877 -0.1497344 0.0625020 -0.1655801 0.0571219 -0.2290329 0.0423353 0.0965635 0.0006595 0.2353515 -0.3626289 0.3613572 -0.0520178 -0.0031134 0.4127596
length -0.3384877 1.0000000 0.0805045 0.0939675 0.2196150 -0.0127203 0.2362724 0.0483279 0.2534052 -0.0115559 0.0080335 0.0922081 -0.4529849 0.0288068 0.0142225 -0.2612139
cfin1 -0.1497344 0.0805045 1.0000000 0.1543359 0.0872516 0.1998542 0.1113421 0.2107115 -0.0665129 0.1159156 -0.0494736 0.1298700 0.0114949 0.1308075 0.0057650 0.0084740
cfin2 0.0625020 0.0939675 0.1543359 1.0000000 -0.0047987 0.3046562 -0.0408257 0.6495371 0.1493054 -0.0939349 0.3315717 -0.2125897 -0.2313141 -0.0810768 0.0124063 -0.0005043
chel1 -0.1655801 0.2196150 0.0872516 -0.0047987 1.0000000 0.2854820 0.9556356 0.2480208 0.1630038 -0.0499888 0.0695477 0.1659248 -0.2149789 -0.1486619 0.0452955 -0.5066948
chel2 0.0571219 -0.0127203 0.1998542 0.3046562 0.2854820 1.0000000 0.1752773 0.8845860 0.0280526 0.0009182 0.2614306 -0.3766883 0.0126750 -0.2206494 0.0682363 -0.0546246
lcop1 -0.2290329 0.2362724 0.1113421 -0.0408257 0.9556356 0.1752773 1.0000000 0.1511895 0.0989205 0.0001623 -0.0094285 0.2648605 -0.2638250 -0.1020682 0.0319319 -0.5520941
lcop2 0.0423353 0.0483279 0.2107115 0.6495371 0.2480208 0.8845860 0.1511895 1.0000000 0.0510242 0.0020792 0.2874762 -0.3007467 -0.1143536 -0.1826004 0.0587263 -0.0394735
fbar 0.0965635 0.2534052 -0.0665129 0.1493054 0.1630038 0.0280526 0.0989205 0.0510242 1.0000000 -0.2382844 0.8181077 -0.5087197 -0.1747903 0.0422798 0.0059388 0.0623021
recr 0.0006595 -0.0115559 0.1159156 -0.0939349 -0.0499888 0.0009182 0.0001623 0.0020792 -0.2382844 1.0000000 -0.2595413 0.3705363 -0.2014227 0.2745302 0.0218885 0.0917721
cumf 0.2353515 0.0080335 -0.0494736 0.3315717 0.0695477 0.2614306 -0.0094285 0.2874762 0.8181077 -0.2595413 1.0000000 -0.7077117 0.0345575 -0.0993258 0.0341512 0.2248259
totaln -0.3626289 0.0922081 0.1298700 -0.2125897 0.1659248 -0.3766883 0.2648605 -0.3007467 -0.5087197 0.3705363 -0.7077117 1.0000000 -0.2887172 0.1465778 -0.0274147 -0.3908094
sst 0.3613572 -0.4529849 0.0114949 -0.2313141 -0.2149789 0.0126750 -0.2638250 -0.1143536 -0.1747903 -0.2014227 0.0345575 -0.2887172 1.0000000 0.0124690 -0.0095941 0.5106722
sal -0.0520178 0.0288068 0.1308075 -0.0810768 -0.1486619 -0.2206494 -0.1020682 -0.1826004 0.0422798 0.2745302 -0.0993258 0.1465778 0.0124690 1.0000000 -0.0230910 0.1287458
xmonth -0.0031134 0.0142225 0.0057650 0.0124063 0.0452955 0.0682363 0.0319319 0.0587263 0.0059388 0.0218885 0.0341512 -0.0274147 -0.0095941 -0.0230910 1.0000000 -0.0035912
nao 0.4127596 -0.2612139 0.0084740 -0.0005043 -0.5066948 -0.0546246 -0.5520941 -0.0394735 0.0623021 0.0917721 0.2248259 -0.3908094 0.5106722 0.1287458 -0.0035912 1.0000000
corrplot(cor(training))

2.7 Regression

Building regression model is checked with two measures RMSE and R^2. Best regression model was built using random forest algorithm.

set.seed(1993)

ctrl <- trainControl(
    method = "repeatedcv",
    number = 2,
    repeats = 5)

set.seed(1993)

rf_fit <- train(length ~ .,
             data = training,
             method = "rf",
             trControl = ctrl,
             importance = TRUE,
             ntree = 5)

rf_pred <- predict(rf_fit, testing)

rf_fit
## Random Forest 
## 
## 39437 samples
##    15 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (2 fold, repeated 5 times) 
## Summary of sample sizes: 19719, 19718, 19719, 19718, 19719, 19718, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared 
##    2    1.139878  0.5256267
##    8    1.117765  0.5471589
##   15    1.211110  0.4952169
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was mtry = 8.
postResample(pred = rf_pred, obs = testing$length) 
##      RMSE  Rsquared 
## 1.1004934 0.5571783
rf_fit2 <- train(length ~ .,
             data = training,
             method = "rf",
             trControl = ctrl,
             importance = TRUE,
             
             ntree = 15)

rf_pred2 <- predict(rf_fit2, testing)

rf_fit2
## Random Forest 
## 
## 39437 samples
##    15 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (2 fold, repeated 5 times) 
## Summary of sample sizes: 19718, 19719, 19718, 19719, 19719, 19718, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared 
##    2    1.134458  0.5302668
##    8    1.098260  0.5610723
##   15    1.185089  0.5093636
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was mtry = 8.
postResample(pred = rf_pred2, obs = testing$length) 
##      RMSE  Rsquared 
## 1.0897997 0.5647872

2.8 Attribute significance

Results for ntree = 5

imp1 <- varImp(rf_fit$finalModel, scale=FALSE)

knitr::kable(imp1)
Overall
X 1.7082357
cfin1 0.4006010
cfin2 0.4523616
chel1 0.2752032
chel2 0.3707563
lcop1 0.5228276
lcop2 0.4337889
fbar 0.5993601
recr 0.5960982
cumf 0.4100346
totaln 0.6370218
sst 2.0892154
sal 0.0795844
xmonth 0.4957700
nao 0.1783721

Results for ntree = 15

imp2 <- varImp(rf_fit2$finalModel, scale=FALSE)

knitr::kable(imp2)
Overall
X 1.4489873
cfin1 0.4189872
cfin2 0.4587345
chel1 0.3251824
chel2 0.5871283
lcop1 0.5511939
lcop2 0.4827812
fbar 0.6947083
recr 0.5904054
cumf 0.2609754
totaln 0.4123031
sst 1.9644069
sal 0.2135060
xmonth 0.4683507
nao 0.1255477

2.9 Conclusion

The attributes with impact for length of herrings are temperature of the water at the surface and north Atlantic oscillation.